library(maps)
### If using fake data
## dir='/Users/coryzigler/Dropbox/Research/HEI Accountability Grant/PM10 Accountability/Data/'

### If using real data
## dir='/Users/coryzigler/Data/Medicare/Linked AQS-Medicare/HEI PM10 Accountability/'

load(paste(dir, 'alldata.RData', sep=""))
## reads in a object called 'alldat'

## For some reason the Arithmetic.Mean variable needs to be converted to numeric
alldat$Arithmetic.Mean = as.numeric(alldat$Arithmetic.Mean)

dat = alldat

## Select years
Years = 1990:2010
dat = subset(dat, Year %in% Years)

## Select only the variables that will be used
dat = dat[, names(dat) %in% c("FIPS", "Year", "Monitor", "Latitude", "Longitude", "Parameter.Name", "Sample.Duration",
                              "Metric.Used", "State.Name", "County.Name",  
                              "Arithmetic.Mean", "X98th.Percentile", "X95th.Percentile", "X90th.Percentile",                     
                              "mean_age", "Female_rate", "White_rate", "Black_rate", "Total_death", "Tot_Beneficiary", "pyear", "ALL_CVD",                       
                              "AMI", "COPD", "CV_stroke", "HF", "HRD", "IHD", "PVD", "RTI", "HRP", "Respiratory", "a",
                              "Median_income", "HS_rate", "Urban_rate", "Migration_5_year_rate", "Hispanic_cens", "White_cens", "Black_cens", 
                              "Other_cens", "Current_Smoking_rate_weighted", "HOUSEUNIT", "POPULATION", "Female_cens", "county_ID", 
                              "Dew_point_annual_mean", "Dew_point_annual_STD", "temperature_annual_mean", "temperature_annual_STD", 
                              "temperature_annual_max_mean", "temperature_annual_min_mean", "Weather_total_site")]


## Reshape
datwide = reshape(dat, direction = "wide", 
      v.names = c(  "mean_age", "Female_rate", "White_rate", "Black_rate", "Total_death", "Tot_Beneficiary", "pyear", "ALL_CVD",                       
                    "AMI", "COPD", "CV_stroke", "HF", "HRD", "IHD", "PVD", "RTI", "HRP", "Respiratory",
                    "Arithmetic.Mean", "X98th.Percentile", "X95th.Percentile", "X90th.Percentile", 
                     "Latitude", "Longitude", "Sample.Duration", "Metric.Used"), 
                    timevar = "Year", idvar = "Monitor")

# Some variables that seem like they shouldn't be time varying are.  Check with the code snippet below if you wish.
#whichvar = "MONITORING_OBJECTIVE"
#which( rowSums(datwide[, paste(whichvar, ".", 2000:2010, sep="")] != datwide[, paste(whichvar, ".1999", sep="")], na.rm=TRUE) > 0)
#datwide[3171, paste(whichvar, ".", 1999:2014, sep="")]

### Need to select a single lat/long for each monitor.  If you just take one year, then latitude and longitude will be missing for any monitor
### that was not in the data for that year (e.g., if you take 1990 then you will miss a monitor that didn't start until 1999, and if you take 1999
### then you will miss a monitor that stopped operating in 1991).

getlast = function(x){
  return(x[max(which(!is.na(x)))])
}

datwide$Longitude = apply(datwide[, paste("Longitude.", Years, sep="")], 1, getlast)
datwide$Latitude = apply(datwide[, paste("Latitude.", Years, sep="")], 1, getlast)
# Same for Monitoring Objective (which no longer appears in the data as of 4/2015)
#datwide$MONITORING_OBJECTIVE = apply(datwide[, paste("MONITORING_OBJECTIVE.", Years, sep="")], 1, getlast)

datwide = datwide[, !(names(datwide) %in% paste("Longitude.", Years, sep=""))]
datwide = datwide[, !(names(datwide) %in% paste("Latitude.", Years, sep=""))]
#datwide = datwide[, !(names(datwide) %in% paste("MONITORING_OBJECTIVE.", Years, sep=""))]

#############################################
######     Calculate Variables          #####
#############################################

#Baseline and follow up PM 
makeavgvar=function(d,varprefix, years, whichdata){
tempmat=d[, names(datwide) %in% paste(varprefix,".", years,whichdata, sep="") ]
return(rowMeans(tempmat, na.rm=TRUE))
}

## Baseline PM10 - use 1990
datwide$pm10base1990 = datwide$Arithmetic.Mean.1990

## Baseline PM10 - use 1990 - 1994
datwide$pm10base1990_1994 = makeavgvar(d=datwide, varprefix="Arithmetic.Mean", years=c("1990", "1991", "1992", "1993", "1994"), "")

## Follow up PM10 - use average of 1999-2001
datwide$pm10fu = makeavgvar(d=datwide, varprefix="Arithmetic.Mean", years=c("1999", "2000", "2001"), "")

## Follow up PM10 - use average 95th %tile from 1999-2001
datwide$pm10fu95 = makeavgvar(d=datwide, varprefix="X95th.Percentile", years=c("1999", "2000", "2001"), "")

## Monitoring Objective - Need to combine some categories here
#datwide$Objective = NA
#keepcategories =  c("POPULATION EXPOSURE", "HIGHEST CONCENTRATION", "UNKNOWN", "GENERAL/BACKGROUND")
#datwide$Objective[datwide$MONITORING_OBJECTIVE %in% keepcategories] = datwide$MONITORING_OBJECTIVE[datwide$MONITORING_OBJECTIVE %in% keepcategories]
#datwide$Objective[!(datwide$MONITORING_OBJECTIVE %in% keepcategories)] = "OTHER"
#datwide$Objective[is.na(datwide$MONITORING_OBJECTIVE)] = "UNKNOWN"
#datwide$Objective = as.factor(datwide$MONITORING_OBJECTIVE)

## Housing Density (from census)
datwide$housedens=datwide$HOUSEUNIT/datwide$POPULATION

datwide$loginc=log(datwide$Median_income)
datwide$logpop=log(datwide$POPULATION)
datwide$lhousedens = log(datwide$housedens)


###############################################
# Make Data Set for Imputing Missing Baseline #
###############################################
## Restrict to Western US
west = subset(datwide, Latitude > -130 & Longitude < -100 & Longitude > -130)
#save(west, file = 'alldatwest1990_2010.RData')


## Remove observations that have (missing baseline AND follow up PM) OR (missing covariates)
## Don't worry about Medicare data here because this data set will be used to impute pollution only (and does not require Medicare data)
west_restrict = subset(west, (!is.na(pm10base1990_1994) | !is.na(pm10fu)) & ( !is.na(Median_income) & !is.na(Current_Smoking_rate_weighted)))

## Look at Missingness
#with(west, table(is.na(Tot_Beneficiary.2001), is.na(pm10base1990_1994), is.na(pm10fu)))
#with(west, table(is.na(Tot_Beneficiary.2001), Monitor.Objective.2001))
#with(west, table(is.na(Tot_Beneficiary.2001), is.na(Monitor.Objective.2001)))

#west_restrict = subset(west, !is.na(Tot_Beneficiary.2001) & (!is.na(pm10fu) | !is.na(pm10base1990)) & !is.na(Current_Smoking_rate_weighted))
#library(maps)
#map("state")
#with(subset(west_restrict, !is.na(Tot_Beneficiary.2001) & !is.na(pm10fu)), points(Longitude, Latitude, pch = 16 - 15*is.na(pm10base1990_1994), col = a+1))



## Use this for imputing baseline data
datforimpute = west_restrict
save(datforimpute, file=paste(dir, 'datforimpute.RData', sep=""))

